In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import networkx
import utils

In [2]:
from collections import defaultdict
from tqdm import tqdm

These are the mapped YAGO categories.


In [3]:
# YAGO class names that have been mapped to the LKIF ontology. Each name is
# appended to RESOURCE_PREFIX to form the class URI used in SPARQL queries.
# The trailing digits are presumably the WordNet synset id -- TODO confirm.
# NOTE(review): wordnet_cooperative_101100877, wordnet_directive_107170080 and
# wordnet_liability_114530403 are listed here but do not appear in
# LKIF_TO_YAGO -- confirm whether that is intentional.
CATEGORIES = [
    "wordnet_association_108049401",
    "wordnet_authorization_101138670",
    "wordnet_ban_107255401",
    "wordnet_case_law_106535035",
    "wordnet_civil_liberty_113995662",
    "wordnet_code_of_conduct_105668095",
    "wordnet_common_law_108453722",
    "wordnet_company_108058098",
    "wordnet_contract_106520944",
    "wordnet_cooperative_101100877",
    "wordnet_corporation_108059412",
    "wordnet_court_108329453",
    "wordnet_criminal_record_106490173",
    "wordnet_decree_106539770",
    "wordnet_delegating_101140839",
    "wordnet_directive_107170080",
    "wordnet_exemption_100213903",
    "wordnet_foundation_108406486",
    "wordnet_indebtedness_114490319",
    "wordnet_interdiction_107255299",
    "wordnet_judge_110225219",
    "wordnet_judiciary_108166187",
    "wordnet_law_106532330",
    "wordnet_law_108441203",
    "wordnet_lawyer_110249950",
    "wordnet_legal_code_106667792",
    "wordnet_legal_document_106479665",
    "wordnet_legislation_106535222",
    "wordnet_legislative_act_106564387",
    "wordnet_legislature_108163273",
    "wordnet_liability_114530403",
    "wordnet_liberty_113994456",
    "wordnet_liberty_113996061",
    "wordnet_limited_company_108185211",
    "wordnet_mandate_106556481",
    "wordnet_obligation_106773150",
    "wordnet_party_110402824",
    "wordnet_permission_106689297",
    "wordnet_pleading_106559365",
    "wordnet_prerogative_105178715",
    "wordnet_privilege_105158296",
    "wordnet_privilege_105179567",
    "wordnet_proclamation_101266491",
    "wordnet_prohibition_106542047",
    "wordnet_right_104850341",
    "wordnet_right_105174653",
    "wordnet_right_113341756",
    "wordnet_treaty_106773434",
    "wordnet_written_agreement_106771653",
]

In [4]:
# Namespace prepended to a YAGO class name to build its full resource URI.
RESOURCE_PREFIX = 'http://yago-knowledge.org/resource/'

In [17]:
def get_subclasses_number(category_name, filter_wikicat=False, populated=True):
    """Count the distinct subclasses of a YAGO class through a SPARQL query.

    Args:
        category_name: YAGO class name; it is appended to RESOURCE_PREFIX to
            form the URI used as the object of the rdfs:subClassOf pattern.
        filter_wikicat: when true, subclasses whose URI matches "wikicat"
            are excluded from the count.
        populated: when true, only subclasses that are the rdf:type of at
            least one entity are counted.

    Returns:
        ``response[1][0]`` of whatever utils.query_sparql returns --
        presumably the single cell of the first result row, i.e. the count as
        delivered by the endpoint (callers in this notebook wrap it in int()).
    """
    # Opening of the query: the SELECT clause plus the subclass triple pattern.
    head = (
        'SELECT DISTINCT (count(distinct ?subCategory) as ?count) WHERE {\n'
        '        ?subCategory rdfs:subClassOf <%s%s> .'
    ) % (RESOURCE_PREFIX, category_name)
    # Optional pattern that requires at least one instance of the subclass.
    entity_pattern = "?entity rdf:type ?subCategory ." if populated else ''
    # Either a wikicat filter followed by the closing brace, or just the brace.
    tail = 'FILTER (!regex(str(?subCategory), "wikicat"))}' if filter_wikicat else '}'
    query = head + entity_pattern + tail
    response = utils.query_sparql(query, utils.YAGO_ENPOINT_URL)
    return response[1][0]

Let's count the number of subcategories there are for each mapped class. The first number counts all populated subcategories (those with entities); the second counts only the populated wordnet_* subcategories.


In [7]:
for category in CATEGORIES:
    count = get_subclasses_number(category, True)
    print category, get_subclasses_number(category), count


wordnet_association_108049401 208 9
wordnet_authorization_101138670 1 1
wordnet_ban_107255401 0 0
wordnet_case_law_106535035 0 0
wordnet_civil_liberty_113995662 1 0
wordnet_code_of_conduct_105668095 1 0
wordnet_common_law_108453722 0 0
wordnet_company_108058098 8411 21
wordnet_contract_106520944 2 1
wordnet_cooperative_101100877 62 0
wordnet_corporation_108059412 52 1
wordnet_court_108329453 166 6
wordnet_criminal_record_106490173 1 0
wordnet_decree_106539770 3 3
wordnet_delegating_101140839 0 0
wordnet_directive_107170080 4 0
wordnet_exemption_100213903 0 0
wordnet_foundation_108406486 1 1
wordnet_indebtedness_114490319 0 0
wordnet_interdiction_107255299 0 0
wordnet_judge_110225219 811 8
wordnet_judiciary_108166187 3 0
wordnet_law_106532330 3 3
wordnet_law_108441203 70 1
wordnet_lawyer_110249950 463 6
wordnet_legal_code_106667792 6 1
wordnet_legal_document_106479665 19 10
wordnet_legislation_106535222 0 0
wordnet_legislative_act_106564387 28 0
wordnet_legislature_108163273 44 2
wordnet_liability_114530403 0 0
wordnet_liberty_113994456 0 0
wordnet_liberty_113996061 0 0
wordnet_limited_company_108185211 0 0
wordnet_mandate_106556481 2 0
wordnet_obligation_106773150 0 0
wordnet_party_110402824 2 2
wordnet_permission_106689297 1 1
wordnet_pleading_106559365 1 1
wordnet_prerogative_105178715 0 0
wordnet_privilege_105158296 1 0
wordnet_privilege_105179567 0 0
wordnet_proclamation_101266491 0 0
wordnet_prohibition_106542047 3 0
wordnet_right_104850341 0 0
wordnet_right_105174653 201 1
wordnet_right_113341756 0 0
wordnet_treaty_106773434 1322 2
wordnet_written_agreement_106771653 2 2

In [20]:
wordnet_subclasses_count = 0
for category in tqdm(CATEGORIES):
    wordnet_subclasses_count += int(get_subclasses_number(category, True, populated=False))
print 'Total classes of YAGO mapped (with not-populated classes as well)', wordnet_subclasses_count + len(CATEGORIES)


100%|██████████| 49/49 [00:50<00:00,  1.00s/it]
Total classes of YAGO mapped (with not-populated classes as well) 358

Download the yago subclasses and extend the mapping. Use only populated classes.


In [6]:
# Disabled on purpose: this rebuilds the class hierarchy from scratch (the
# recorded run below took about 3.5 minutes). The graph is loaded from
# '../../data/yago_hierarchy.pickle' instead; uncomment to regenerate it.
# graph = networkx.DiGraph()
# for category in tqdm(CATEGORIES):
#     utils.add_subcategories(category, graph)


100%|██████████| 49/49 [03:30<00:00,  4.01s/it]

In [14]:
# Load the cached YAGO class hierarchy graph.
# NOTE(review): presumably this unpickles the file; unpickling can execute
# arbitrary code, so only load pickles from a trusted source.
graph = utils.pickle_from_file('../../data/yago_hierarchy.pickle')

In [7]:
# utils.pickle_to_file(graph, '../../data/yago_hierarchy.pickle')

In [14]:
# Display the hierarchy as an adjacency mapping: class -> list of its successors.
networkx.to_dict_of_lists(graph)


Out[14]:
{u'wordnet_accreditation_101140193': [],
 u'wordnet_accusation_107234230': [u'wordnet_allegation_107236077'],
 u'wordnet_act_106532095': [u'wordnet_legislative_act_106564387',
  u'wordnet_decree_106539770'],
 u'wordnet_advocate_109775663': [],
 u'wordnet_alcalde_109781804': [],
 u'wordnet_allegation_107236077': [],
 u'wordnet_appellate_court_108330106': [],
 'wordnet_association_108049401': [u'wordnet_league_108231184',
  u'wordnet_gang_108244062',
  u'wordnet_institute_108407330',
  u'wordnet_secret_society_108235343',
  u'wordnet_professional_association_108242675',
  u'wordnet_family_108227916',
  u'wordnet_consortium_108236438',
  u'wordnet_club_108227214',
  u'wordnet_chamber_of_commerce_108319061'],
 'wordnet_authorization_101138670': [u'wordnet_certification_101139830'],
 'wordnet_ban_107255401': [],
 u'wordnet_barrister_109840963': [u'wordnet_serjeant-at-law_110581890'],
 u'wordnet_baseball_league_108231874': [],
 u'wordnet_basketball_league_108232299': [],
 u'wordnet_bill_106536853': [],
 u'wordnet_bond_113417410': [u'wordnet_government_bond_113338234'],
 u'wordnet_broadcasting_company_108002015': [],
 u'wordnet_builder_109878275': [u'wordnet_contractor_109960688'],
 u'wordnet_bull_106726761': [],
 u'wordnet_bus_company_108186761': [],
 'wordnet_case_law_106535035': [],
 u'wordnet_certification_101139830': [u'wordnet_accreditation_101140193'],
 u'wordnet_chamber_of_commerce_108319061': [],
 u'wordnet_charge_106561942': [u'wordnet_accusation_107234230'],
 u'wordnet_charity_108406619': [],
 u'wordnet_check_113381734': [u'wordnet_kite_113382471'],
 u'wordnet_chess_club_108229275': [],
 u'wordnet_chief_justice_109916788': [],
 u'wordnet_civil_law_108453464': [u'wordnet_legislation_106535222'],
 'wordnet_civil_liberty_113995662': [],
 u'wordnet_civil_right_105182563': [],
 u'wordnet_closed_corporation_108383690': [u'wordnet_family_business_108383909'],
 u'wordnet_club_108227214': [u'wordnet_rowing_club_108230219',
  u'wordnet_sorority_108230477',
  u'wordnet_fraternity_108229467',
  u'wordnet_yacht_club_108230785',
  u'wordnet_racket_club_108230110',
  u'wordnet_glee_club_108229605',
  u'wordnet_chess_club_108229275',
  u'wordnet_golf_club_108229694'],
 'wordnet_code_of_conduct_105668095': [],
 u'wordnet_commercial_treaty_106773857': [],
 'wordnet_common_law_108453722': [],
 'wordnet_company_108058098': [u'wordnet_film_company_108003173',
  u'wordnet_steel_company_108003839',
  u'wordnet_transportation_company_108004089',
  u'wordnet_oil_company_108069241',
  u'wordnet_dot-com_108002384',
  u'wordnet_drug_company_108002578',
  u'wordnet_subsidiary_company_108003935',
  u'wordnet_think_tank_108478702',
  u'wordnet_furniture_company_108003525',
  u'wordnet_mining_company_108003619',
  u'wordnet_electronics_company_108003035',
  u'wordnet_printing_concern_108069627',
  u'wordnet_holding_company_108185369',
  u'wordnet_broadcasting_company_108002015',
  u'wordnet_service_108186047',
  u'wordnet_livery_company_108186898',
  u'wordnet_mover_108478482',
  u'wordnet_shipping_company_108003717',
  u'wordnet_packaging_company_108069342',
  u'wordnet_food_company_108003427',
  u'wordnet_pipeline_company_108069487'],
 u'wordnet_concession_106526619': [u'wordnet_franchise_106526811'],
 u'wordnet_consortium_108236438': [u'wordnet_trust_108236621'],
 'wordnet_contract_106520944': [u'wordnet_concession_106526619'],
 u'wordnet_contractor_109960688': [],
 u'wordnet_contractor_109960891': [u'wordnet_builder_109878275'],
 'wordnet_cooperative_101100877': [],
 'wordnet_corporation_108059412': [u'wordnet_closed_corporation_108383690'],
 'wordnet_court_108329453': [u'wordnet_superior_court_108335751',
  u'wordnet_supreme_court_108336188',
  u'wordnet_military_court_108334087',
  u'wordnet_appellate_court_108330106',
  u'wordnet_probate_court_108335087',
  u'wordnet_federal_court_108332330'],
 u'wordnet_court_order_106539502': [],
 u'wordnet_criminal_law_106539178': [],
 'wordnet_criminal_record_106490173': [],
 'wordnet_decree_106539770': [u'wordnet_prohibition_106542047',
  u'wordnet_bull_106726761',
  u'wordnet_imperial_decree_106541167'],
 u'wordnet_defense_attorney_110000158': [],
 'wordnet_delegating_101140839': [],
 u'wordnet_derivative_instrument_106480506': [u'wordnet_option_113241600'],
 'wordnet_directive_107170080': [],
 u'wordnet_district_attorney_110019072': [],
 u'wordnet_doge_110023264': [],
 u'wordnet_dot-com_108002384': [],
 u'wordnet_draft_113377268': [u'wordnet_check_113381734',
  u'wordnet_money_order_113380820'],
 u"wordnet_driver's_license_106550206": [],
 u'wordnet_drug_cartel_108236963': [],
 u'wordnet_drug_company_108002578': [],
 u'wordnet_electronics_company_108003035': [],
 'wordnet_exemption_100213903': [],
 u'wordnet_family_108227916': [],
 u'wordnet_family_business_108383909': [],
 u'wordnet_federal_court_108332330': [],
 u'wordnet_film_company_108003173': [],
 u'wordnet_food_company_108003427': [],
 u'wordnet_football_league_108232496': [],
 'wordnet_foundation_108406486': [u'wordnet_charity_108406619'],
 u'wordnet_franchise_106526811': [],
 u'wordnet_fraternity_108229467': [],
 u'wordnet_fundamental_law_106533648': [],
 u'wordnet_furniture_company_108003525': [],
 u'wordnet_gang_108244062': [],
 u'wordnet_gas_company_108186655': [],
 u'wordnet_glee_club_108229605': [],
 u'wordnet_golf_club_108229694': [],
 u'wordnet_government_bond_113338234': [],
 u'wordnet_hockey_league_108232603': [],
 u'wordnet_holding_company_108185369': [],
 u'wordnet_human_right_105176846': [u'wordnet_civil_right_105182563'],
 u'wordnet_imperial_decree_106541167': [],
 'wordnet_indebtedness_114490319': [],
 u'wordnet_institute_108407330': [],
 'wordnet_interdiction_107255299': [],
 'wordnet_judge_110225219': [u'wordnet_trial_judge_110728523',
  u'wordnet_ordinary_110382380',
  u'wordnet_chief_justice_109916788',
  u'wordnet_doge_110023264',
  u'wordnet_praetor_110463028',
  u'wordnet_alcalde_109781804',
  u'wordnet_justiciar_110228592',
  u'wordnet_magistrate_110280945'],
 'wordnet_judiciary_108166187': [],
 u'wordnet_justiciar_110228592': [],
 u'wordnet_kite_113382471': [],
 'wordnet_law_106532330': [u'wordnet_fundamental_law_106533648',
  u'wordnet_statute_of_limitations_106533484',
  u'wordnet_poor_law_106538785'],
 'wordnet_law_108441203': [u'wordnet_civil_law_108453464'],
 'wordnet_lawyer_110249950': [u'wordnet_advocate_109775663',
  u'wordnet_prosecutor_110484858',
  u'wordnet_defense_attorney_110000158',
  u'wordnet_trial_attorney_110728361',
  u'wordnet_barrister_109840963',
  u'wordnet_public_defender_110490557'],
 u'wordnet_league_108231184': [u'wordnet_football_league_108232496',
  u'wordnet_hockey_league_108232603',
  u'wordnet_baseball_league_108231874',
  u'wordnet_basketball_league_108232299'],
 'wordnet_legal_code_106667792': [u'wordnet_criminal_law_106539178'],
 'wordnet_legal_document_106479665': [u'wordnet_written_agreement_106771653',
  u'wordnet_law_106532330',
  u'wordnet_writ_106552984',
  u'wordnet_security_113416345',
  u'wordnet_act_106532095',
  u'wordnet_bill_106536853',
  u'wordnet_negotiable_instrument_106481156',
  u'wordnet_derivative_instrument_106480506',
  u'wordnet_mandate_106556481',
  u'wordnet_license_106549661'],
 u'wordnet_legislation_106535222': [],
 u'wordnet_legislative_act_106564387': [],
 'wordnet_legislature_108163273': [u'wordnet_parliament_108319198',
  u'wordnet_senate_108161477'],
 'wordnet_liability_114530403': [],
 'wordnet_liberty_113994456': [],
 'wordnet_liberty_113996061': [],
 u'wordnet_license_106549661': [u"wordnet_driver's_license_106550206"],
 'wordnet_limited_company_108185211': [],
 u'wordnet_litigant_110266848': [],
 u'wordnet_livery_company_108186898': [],
 u'wordnet_magistrate_110280945': [],
 u'wordnet_mandate_106556481': [],
 u'wordnet_military_court_108334087': [],
 u'wordnet_mining_company_108003619': [],
 u'wordnet_money_order_113380820': [],
 u'wordnet_mover_108478482': [],
 u'wordnet_negotiable_instrument_106481156': [u'wordnet_draft_113377268'],
 'wordnet_obligation_106773150': [],
 u'wordnet_oil_company_108069241': [],
 u'wordnet_option_113241600': [],
 u'wordnet_ordinary_110382380': [],
 u'wordnet_packaging_company_108069342': [],
 u'wordnet_parliament_108319198': [],
 'wordnet_party_110402824': [u'wordnet_contractor_109960891',
  u'wordnet_litigant_110266848'],
 u'wordnet_pass_106691083': [],
 u'wordnet_peace_106773976': [],
 'wordnet_permission_106689297': [u'wordnet_pass_106691083'],
 u'wordnet_pipeline_company_108069487': [],
 'wordnet_pleading_106559365': [u'wordnet_charge_106561942'],
 u'wordnet_poor_law_106538785': [],
 u'wordnet_power_company_108186393': [],
 u'wordnet_praetor_110463028': [],
 'wordnet_prerogative_105178715': [],
 u'wordnet_printing_concern_108069627': [],
 'wordnet_privilege_105158296': [],
 'wordnet_privilege_105179567': [],
 u'wordnet_probate_court_108335087': [],
 'wordnet_proclamation_101266491': [],
 u'wordnet_professional_association_108242675': [],
 u'wordnet_prohibition_106542047': [],
 u'wordnet_prosecutor_110484858': [u'wordnet_district_attorney_110019072',
  u"wordnet_state's_attorney_110649962"],
 u'wordnet_public_defender_110490557': [],
 u'wordnet_racket_club_108230110': [u'wordnet_tennis_club_108230590'],
 'wordnet_right_104850341': [],
 'wordnet_right_105174653': [u'wordnet_human_right_105176846'],
 'wordnet_right_113341756': [],
 u'wordnet_rowing_club_108230219': [],
 u'wordnet_secret_society_108235343': [],
 u'wordnet_security_113416345': [u'wordnet_bond_113417410'],
 u'wordnet_senate_108161477': [],
 u'wordnet_serjeant-at-law_110581890': [],
 u'wordnet_service_108186047': [u'wordnet_utility_108185758'],
 u'wordnet_shipping_company_108003717': [u'wordnet_trucking_company_108004210'],
 u'wordnet_sorority_108230477': [],
 u"wordnet_state's_attorney_110649962": [],
 u'wordnet_statute_of_limitations_106533484': [],
 u'wordnet_steel_company_108003839': [],
 u'wordnet_subsidiary_company_108003935': [],
 u'wordnet_superior_court_108335751': [],
 u'wordnet_supreme_court_108336188': [],
 u'wordnet_telephone_company_108186221': [],
 u'wordnet_tennis_club_108230590': [],
 u'wordnet_think_tank_108478702': [],
 u'wordnet_transportation_company_108004089': [u'wordnet_bus_company_108186761'],
 u'wordnet_treaty_106773434': [u'wordnet_peace_106773976',
  u'wordnet_commercial_treaty_106773857'],
 u'wordnet_trial_attorney_110728361': [],
 u'wordnet_trial_judge_110728523': [],
 u'wordnet_trucking_company_108004210': [],
 u'wordnet_trust_108236621': [u'wordnet_drug_cartel_108236963'],
 u'wordnet_utility_108185758': [u'wordnet_water_company_108186546',
  u'wordnet_bus_company_108186761',
  u'wordnet_gas_company_108186655',
  u'wordnet_power_company_108186393',
  u'wordnet_telephone_company_108186221'],
 u'wordnet_warrant_106547059': [],
 u'wordnet_water_company_108186546': [],
 u'wordnet_writ_106552984': [u'wordnet_warrant_106547059',
  u'wordnet_court_order_106539502'],
 u'wordnet_written_agreement_106771653': [u'wordnet_treaty_106773434',
  u'wordnet_contract_106520944'],
 u'wordnet_yacht_club_108230785': []}

Structure of the yago (sub)ontology

Let's add the new hierarchy to the old mapping


In [11]:
# Mapping from an LKIF class name to the YAGO classes it is mapped to.
# NOTE(review): u'wordnet_prohibition_106542047' is listed under both 'Decree'
# and 'Prohibition', so it is counted twice when the list lengths are summed.
LKIF_TO_YAGO = {
    'Decree': [u'wordnet_prohibition_106542047', u'wordnet_decree_106539770'],
    'Proclamation': [u'wordnet_proclamation_101266491'],
    'Public_Body': [u'wordnet_court_108329453'],
    'Regulation': [
        u'wordnet_law_106532330', u'wordnet_legal_code_106667792', u'wordnet_legislation_106535222',
        u'wordnet_law_108441203'],
    'Immunity': [u'wordnet_exemption_100213903'],
    'Foundation': [u'wordnet_foundation_108406486'],
    'Permission': [u'wordnet_permission_106689297'],
    'Company': [u'wordnet_company_108058098'],
    'Prohibition': [u'wordnet_prohibition_106542047', u'wordnet_ban_107255401', u'wordnet_interdiction_107255299'],
    'Liability_Right': [u'wordnet_indebtedness_114490319'],
    'Legal_Doctrine': [u'wordnet_common_law_108453722', u'wordnet_case_law_106535035'],
    'Legal_Document': [
        u'wordnet_written_agreement_106771653', u'wordnet_criminal_record_106490173',
        u'wordnet_mandate_106556481', u'wordnet_legal_document_106479665'],
    'Corporation': [u'wordnet_corporation_108059412'],
    'Legislative_Body': [u'wordnet_legislature_108163273'],
    'Permissive_Right': [u'wordnet_authorization_101138670'],
    'Statute': [u'wordnet_legislative_act_106564387'],
    'Limited_Company': [u'wordnet_limited_company_108185211'],
    'Legal_Speech_Act': [u'wordnet_pleading_106559365'],
    'Contract': [u'wordnet_contract_106520944'],
    'Treaty': [u'wordnet_treaty_106773434'],
    'Legal_Role': [u'wordnet_party_110402824'],
    'Liberty_Right': [
        u'wordnet_privilege_105179567', u'wordnet_liberty_113996061', u'wordnet_prerogative_105178715',
        u'wordnet_civil_liberty_113995662', u'wordnet_privilege_105158296', u'wordnet_liberty_113994456'],
    'Obligation': [u'wordnet_obligation_106773150'],
    'Right': [u'wordnet_right_104850341', u'wordnet_right_105174653'],
    'Professional_Legal_Role': [
        u'wordnet_judge_110225219', u'wordnet_lawyer_110249950',
        u'wordnet_judiciary_108166187'],
    'Delegation': [u'wordnet_delegating_101140839'],
    'Society': [u'wordnet_association_108049401'],
    'Potestative_Right': [u'wordnet_right_113341756'],
    'Code_of_Conduct': [u'wordnet_code_of_conduct_105668095'],
}




In [16]:
print 'Number of LKIF classes mapped', len(LKIF_TO_YAGO)
print 'Number of YAGO classes (directly) mapped', sum([len(x) for x in LKIF_TO_YAGO.values()])
print 'Number of YAGO classes and populated subclasses mapped', len(graph.nodes())


Number of LKIF classes mapped 29
Number of YAGO classes (directly) mapped 47
Number of YAGO classes and populated subclasses mapped 169

In [17]:
def get_successors(graph, node):
    """Return the hierarchy below ``node`` as nested dictionaries.

    Args:
        graph: object supporting ``node in graph`` and ``graph.successors(node)``
            (a networkx directed graph in this notebook).
        node: the node whose successors are expanded.

    Returns:
        A dict mapping each successor of ``node`` to the result of this same
        function applied to that successor; a node without successors yields
        an empty dict. If ``node`` is not in ``graph`` a message is printed
        and an empty dict is returned.

    Note:
        The recursion follows every successor edge, so a graph containing a
        cycle would recurse without terminating.
    """
    if node not in graph:
        # Single-argument print(...) gives the same output on Python 2 and 3.
        print('Weird, node {} not in graph'.format(node))
        return {}
    return dict(
        (successor, get_successors(graph, successor))
        for successor in graph.successors(node)
    )

In [18]:
# For every LKIF class, attach to each of its mapped YAGO classes the nested
# tree of successors found in the hierarchy graph.
subontology = defaultdict(dict)
for lkif_class, yago_classes in LKIF_TO_YAGO.items():
    for yago_class in yago_classes:
        subtree = get_successors(graph, yago_class)
        subontology[lkif_class][yago_class] = subtree

In [19]:
# Convert the defaultdict into a plain dict for display.
dict(subontology)


Out[19]:
{'Code_of_Conduct': {u'wordnet_code_of_conduct_105668095': {}},
 'Company': {u'wordnet_company_108058098': {u'wordnet_broadcasting_company_108002015': {},
   u'wordnet_dot-com_108002384': {},
   u'wordnet_drug_company_108002578': {},
   u'wordnet_electronics_company_108003035': {},
   u'wordnet_film_company_108003173': {},
   u'wordnet_food_company_108003427': {},
   u'wordnet_furniture_company_108003525': {},
   u'wordnet_holding_company_108185369': {},
   u'wordnet_livery_company_108186898': {},
   u'wordnet_mining_company_108003619': {},
   u'wordnet_mover_108478482': {},
   u'wordnet_oil_company_108069241': {},
   u'wordnet_packaging_company_108069342': {},
   u'wordnet_pipeline_company_108069487': {},
   u'wordnet_printing_concern_108069627': {},
   u'wordnet_service_108186047': {u'wordnet_utility_108185758': {u'wordnet_bus_company_108186761': {},
     u'wordnet_gas_company_108186655': {},
     u'wordnet_power_company_108186393': {},
     u'wordnet_telephone_company_108186221': {},
     u'wordnet_water_company_108186546': {}}},
   u'wordnet_shipping_company_108003717': {u'wordnet_trucking_company_108004210': {}},
   u'wordnet_steel_company_108003839': {},
   u'wordnet_subsidiary_company_108003935': {},
   u'wordnet_think_tank_108478702': {},
   u'wordnet_transportation_company_108004089': {u'wordnet_bus_company_108186761': {}}}},
 'Contract': {u'wordnet_contract_106520944': {u'wordnet_concession_106526619': {u'wordnet_franchise_106526811': {}}}},
 'Corporation': {u'wordnet_corporation_108059412': {u'wordnet_closed_corporation_108383690': {u'wordnet_family_business_108383909': {}}}},
 'Decree': {u'wordnet_decree_106539770': {u'wordnet_bull_106726761': {},
   u'wordnet_imperial_decree_106541167': {},
   u'wordnet_prohibition_106542047': {}},
  u'wordnet_prohibition_106542047': {}},
 'Delegation': {u'wordnet_delegating_101140839': {}},
 'Foundation': {u'wordnet_foundation_108406486': {u'wordnet_charity_108406619': {}}},
 'Immunity': {u'wordnet_exemption_100213903': {}},
 'Legal_Doctrine': {u'wordnet_case_law_106535035': {},
  u'wordnet_common_law_108453722': {}},
 'Legal_Document': {u'wordnet_criminal_record_106490173': {},
  u'wordnet_legal_document_106479665': {u'wordnet_act_106532095': {u'wordnet_decree_106539770': {u'wordnet_bull_106726761': {},
     u'wordnet_imperial_decree_106541167': {},
     u'wordnet_prohibition_106542047': {}},
    u'wordnet_legislative_act_106564387': {}},
   u'wordnet_bill_106536853': {},
   u'wordnet_derivative_instrument_106480506': {u'wordnet_option_113241600': {}},
   u'wordnet_law_106532330': {u'wordnet_fundamental_law_106533648': {},
    u'wordnet_poor_law_106538785': {},
    u'wordnet_statute_of_limitations_106533484': {}},
   u'wordnet_license_106549661': {u"wordnet_driver's_license_106550206": {}},
   u'wordnet_mandate_106556481': {},
   u'wordnet_negotiable_instrument_106481156': {u'wordnet_draft_113377268': {u'wordnet_check_113381734': {u'wordnet_kite_113382471': {}},
     u'wordnet_money_order_113380820': {}}},
   u'wordnet_security_113416345': {u'wordnet_bond_113417410': {u'wordnet_government_bond_113338234': {}}},
   u'wordnet_writ_106552984': {u'wordnet_court_order_106539502': {},
    u'wordnet_warrant_106547059': {}},
   u'wordnet_written_agreement_106771653': {u'wordnet_contract_106520944': {u'wordnet_concession_106526619': {u'wordnet_franchise_106526811': {}}},
    u'wordnet_treaty_106773434': {u'wordnet_commercial_treaty_106773857': {},
     u'wordnet_peace_106773976': {}}}},
  u'wordnet_mandate_106556481': {},
  u'wordnet_written_agreement_106771653': {u'wordnet_contract_106520944': {u'wordnet_concession_106526619': {u'wordnet_franchise_106526811': {}}},
   u'wordnet_treaty_106773434': {u'wordnet_commercial_treaty_106773857': {},
    u'wordnet_peace_106773976': {}}}},
 'Legal_Role': {u'wordnet_party_110402824': {u'wordnet_contractor_109960891': {u'wordnet_builder_109878275': {u'wordnet_contractor_109960688': {}}},
   u'wordnet_litigant_110266848': {}}},
 'Legal_Speech_Act': {u'wordnet_pleading_106559365': {u'wordnet_charge_106561942': {u'wordnet_accusation_107234230': {u'wordnet_allegation_107236077': {}}}}},
 'Legislative_Body': {u'wordnet_legislature_108163273': {u'wordnet_parliament_108319198': {},
   u'wordnet_senate_108161477': {}}},
 'Liability_Right': {u'wordnet_indebtedness_114490319': {}},
 'Liberty_Right': {u'wordnet_civil_liberty_113995662': {},
  u'wordnet_liberty_113994456': {},
  u'wordnet_liberty_113996061': {},
  u'wordnet_prerogative_105178715': {},
  u'wordnet_privilege_105158296': {},
  u'wordnet_privilege_105179567': {}},
 'Limited_Company': {u'wordnet_limited_company_108185211': {}},
 'Obligation': {u'wordnet_obligation_106773150': {}},
 'Permission': {u'wordnet_permission_106689297': {u'wordnet_pass_106691083': {}}},
 'Permissive_Right': {u'wordnet_authorization_101138670': {u'wordnet_certification_101139830': {u'wordnet_accreditation_101140193': {}}}},
 'Potestative_Right': {u'wordnet_right_113341756': {}},
 'Proclamation': {u'wordnet_proclamation_101266491': {}},
 'Professional_Legal_Role': {u'wordnet_judge_110225219': {u'wordnet_alcalde_109781804': {},
   u'wordnet_chief_justice_109916788': {},
   u'wordnet_doge_110023264': {},
   u'wordnet_justiciar_110228592': {},
   u'wordnet_magistrate_110280945': {},
   u'wordnet_ordinary_110382380': {},
   u'wordnet_praetor_110463028': {},
   u'wordnet_trial_judge_110728523': {}},
  u'wordnet_judiciary_108166187': {},
  u'wordnet_lawyer_110249950': {u'wordnet_advocate_109775663': {},
   u'wordnet_barrister_109840963': {u'wordnet_serjeant-at-law_110581890': {}},
   u'wordnet_defense_attorney_110000158': {},
   u'wordnet_prosecutor_110484858': {u'wordnet_district_attorney_110019072': {},
    u"wordnet_state's_attorney_110649962": {}},
   u'wordnet_public_defender_110490557': {},
   u'wordnet_trial_attorney_110728361': {}}},
 'Prohibition': {u'wordnet_ban_107255401': {},
  u'wordnet_interdiction_107255299': {},
  u'wordnet_prohibition_106542047': {}},
 'Public_Body': {u'wordnet_court_108329453': {u'wordnet_appellate_court_108330106': {},
   u'wordnet_federal_court_108332330': {},
   u'wordnet_military_court_108334087': {},
   u'wordnet_probate_court_108335087': {},
   u'wordnet_superior_court_108335751': {},
   u'wordnet_supreme_court_108336188': {}}},
 'Regulation': {u'wordnet_law_106532330': {u'wordnet_fundamental_law_106533648': {},
   u'wordnet_poor_law_106538785': {},
   u'wordnet_statute_of_limitations_106533484': {}},
  u'wordnet_law_108441203': {u'wordnet_civil_law_108453464': {u'wordnet_legislation_106535222': {}}},
  u'wordnet_legal_code_106667792': {u'wordnet_criminal_law_106539178': {}},
  u'wordnet_legislation_106535222': {}},
 'Right': {u'wordnet_right_104850341': {},
  u'wordnet_right_105174653': {u'wordnet_human_right_105176846': {u'wordnet_civil_right_105182563': {}}}},
 'Society': {u'wordnet_association_108049401': {u'wordnet_chamber_of_commerce_108319061': {},
   u'wordnet_club_108227214': {u'wordnet_chess_club_108229275': {},
    u'wordnet_fraternity_108229467': {},
    u'wordnet_glee_club_108229605': {},
    u'wordnet_golf_club_108229694': {},
    u'wordnet_racket_club_108230110': {u'wordnet_tennis_club_108230590': {}},
    u'wordnet_rowing_club_108230219': {},
    u'wordnet_sorority_108230477': {},
    u'wordnet_yacht_club_108230785': {}},
   u'wordnet_consortium_108236438': {u'wordnet_trust_108236621': {u'wordnet_drug_cartel_108236963': {}}},
   u'wordnet_family_108227916': {},
   u'wordnet_gang_108244062': {},
   u'wordnet_institute_108407330': {},
   u'wordnet_league_108231184': {u'wordnet_baseball_league_108231874': {},
    u'wordnet_basketball_league_108232299': {},
    u'wordnet_football_league_108232496': {},
    u'wordnet_hockey_league_108232603': {}},
   u'wordnet_professional_association_108242675': {},
   u'wordnet_secret_society_108235343': {}}},
 'Statute': {u'wordnet_legislative_act_106564387': {}},
 'Treaty': {u'wordnet_treaty_106773434': {u'wordnet_commercial_treaty_106773857': {},
   u'wordnet_peace_106773976': {}}}}

In [20]:
# Display the classes of the hierarchy graph in a topological order.
# list() keeps the displayed result a list on networkx >= 2.0, where
# topological_sort returns a generator; it is a no-op copy on older versions.
list(networkx.topological_sort(graph))


Out[20]:
['wordnet_proclamation_101266491',
 'wordnet_legislature_108163273',
 'wordnet_authorization_101138670',
 'wordnet_legal_document_106479665',
 u'wordnet_act_106532095',
 'wordnet_decree_106539770',
 'wordnet_liability_114530403',
 'wordnet_judge_110225219',
 u'wordnet_ordinary_110382380',
 u'wordnet_praetor_110463028',
 'wordnet_permission_106689297',
 u'wordnet_pass_106691083',
 'wordnet_law_108441203',
 'wordnet_law_106532330',
 u'wordnet_fundamental_law_106533648',
 'wordnet_company_108058098',
 u'wordnet_dot-com_108002384',
 u'wordnet_mining_company_108003619',
 u'wordnet_mover_108478482',
 'wordnet_party_110402824',
 'wordnet_civil_liberty_113995662',
 u'wordnet_packaging_company_108069342',
 u'wordnet_civil_law_108453464',
 u'wordnet_license_106549661',
 'wordnet_common_law_108453722',
 'wordnet_ban_107255401',
 u'wordnet_livery_company_108186898',
 'wordnet_judiciary_108166187',
 'wordnet_limited_company_108185211',
 'wordnet_privilege_105179567',
 'wordnet_pleading_106559365',
 u'wordnet_charge_106561942',
 u'wordnet_bull_106726761',
 u'wordnet_legislative_act_106564387',
 u'wordnet_imperial_decree_106541167',
 'wordnet_obligation_106773150',
 u'wordnet_litigant_110266848',
 u'wordnet_written_agreement_106771653',
 u'wordnet_mandate_106556481',
 'wordnet_directive_107170080',
 'wordnet_right_113341756',
 u'wordnet_printing_concern_108069627',
 'wordnet_case_law_106535035',
 u'wordnet_drug_company_108002578',
 'wordnet_cooperative_101100877',
 u'wordnet_senate_108161477',
 'wordnet_criminal_record_106490173',
 'wordnet_exemption_100213903',
 'wordnet_code_of_conduct_105668095',
 u'wordnet_alcalde_109781804',
 u'wordnet_electronics_company_108003035',
 u'wordnet_magistrate_110280945',
 u'wordnet_contractor_109960891',
 u'wordnet_builder_109878275',
 u'wordnet_contractor_109960688',
 u'wordnet_steel_company_108003839',
 'wordnet_right_104850341',
 'wordnet_corporation_108059412',
 'wordnet_liberty_113996061',
 'wordnet_court_108329453',
 u'wordnet_military_court_108334087',
 u'wordnet_probate_court_108335087',
 u'wordnet_think_tank_108478702',
 u'wordnet_treaty_106773434',
 u'wordnet_peace_106773976',
 u'wordnet_commercial_treaty_106773857',
 u'wordnet_food_company_108003427',
 u'wordnet_service_108186047',
 u'wordnet_utility_108185758',
 u'wordnet_water_company_108186546',
 u'wordnet_power_company_108186393',
 u'wordnet_telephone_company_108186221',
 u'wordnet_broadcasting_company_108002015',
 u'wordnet_holding_company_108185369',
 u'wordnet_gas_company_108186655',
 u'wordnet_appellate_court_108330106',
 u'wordnet_prohibition_106542047',
 'wordnet_interdiction_107255299',
 u"wordnet_driver's_license_106550206",
 'wordnet_privilege_105158296',
 u'wordnet_bill_106536853',
 u'wordnet_closed_corporation_108383690',
 u'wordnet_chief_justice_109916788',
 'wordnet_indebtedness_114490319',
 u'wordnet_superior_court_108335751',
 u'wordnet_negotiable_instrument_106481156',
 u'wordnet_derivative_instrument_106480506',
 u'wordnet_option_113241600',
 u'wordnet_draft_113377268',
 u'wordnet_check_113381734',
 u'wordnet_kite_113382471',
 u'wordnet_money_order_113380820',
 'wordnet_association_108049401',
 u'wordnet_league_108231184',
 u'wordnet_hockey_league_108232603',
 u'wordnet_baseball_league_108231874',
 u'wordnet_basketball_league_108232299',
 u'wordnet_gang_108244062',
 u'wordnet_institute_108407330',
 u'wordnet_secret_society_108235343',
 u'wordnet_family_108227916',
 u'wordnet_family_business_108383909',
 'wordnet_prerogative_105178715',
 u'wordnet_justiciar_110228592',
 'wordnet_delegating_101140839',
 'wordnet_lawyer_110249950',
 u'wordnet_advocate_109775663',
 u'wordnet_prosecutor_110484858',
 u'wordnet_district_attorney_110019072',
 u"wordnet_state's_attorney_110649962",
 u'wordnet_trial_attorney_110728361',
 u'wordnet_barrister_109840963',
 u'wordnet_serjeant-at-law_110581890',
 u'wordnet_public_defender_110490557',
 u'wordnet_furniture_company_108003525',
 u'wordnet_parliament_108319198',
 u'wordnet_poor_law_106538785',
 u'wordnet_club_108227214',
 u'wordnet_rowing_club_108230219',
 u'wordnet_fraternity_108229467',
 u'wordnet_yacht_club_108230785',
 u'wordnet_racket_club_108230110',
 u'wordnet_tennis_club_108230590',
 u'wordnet_glee_club_108229605',
 u'wordnet_chess_club_108229275',
 u'wordnet_certification_101139830',
 u'wordnet_transportation_company_108004089',
 u'wordnet_bus_company_108186761',
 'wordnet_right_105174653',
 u'wordnet_human_right_105176846',
 u'wordnet_civil_right_105182563',
 u'wordnet_pipeline_company_108069487',
 u'wordnet_statute_of_limitations_106533484',
 u'wordnet_chamber_of_commerce_108319061',
 u'wordnet_legislation_106535222',
 u'wordnet_federal_court_108332330',
 'wordnet_liberty_113994456',
 u'wordnet_subsidiary_company_108003935',
 u'wordnet_shipping_company_108003717',
 u'wordnet_accreditation_101140193',
 u'wordnet_doge_110023264',
 u'wordnet_trucking_company_108004210',
 'wordnet_foundation_108406486',
 u'wordnet_charity_108406619',
 u'wordnet_film_company_108003173',
 u'wordnet_accusation_107234230',
 'wordnet_contract_106520944',
 u'wordnet_concession_106526619',
 u'wordnet_franchise_106526811',
 u'wordnet_golf_club_108229694',
 'wordnet_legal_code_106667792',
 u'wordnet_criminal_law_106539178',
 u'wordnet_football_league_108232496',
 u'wordnet_consortium_108236438',
 u'wordnet_trust_108236621',
 u'wordnet_drug_cartel_108236963',
 u'wordnet_allegation_107236077',
 u'wordnet_trial_judge_110728523',
 u'wordnet_oil_company_108069241',
 u'wordnet_defense_attorney_110000158',
 u'wordnet_security_113416345',
 u'wordnet_bond_113417410',
 u'wordnet_government_bond_113338234',
 u'wordnet_supreme_court_108336188',
 u'wordnet_professional_association_108242675',
 u'wordnet_writ_106552984',
 u'wordnet_warrant_106547059',
 u'wordnet_court_order_106539502',
 u'wordnet_sorority_108230477']

Entities

Let's add the number of downloaded entities


In [21]:
# Number of downloaded entities recorded for each YAGO class, keyed by class
# name. get_total_count adds each value to the totals of the class's
# successors, so the numbers here are treated as excluding subclasses.
counts = {
    'wordnet_accreditation_101140193': 2,
    'wordnet_accusation_107234230': 0,
    'wordnet_act_106532095': 749,
    'wordnet_advocate_109775663': 33,
    'wordnet_alcalde_109781804': 3,
    'wordnet_allegation_107236077': 16,
    'wordnet_appellate_court_108330106': 76,
    'wordnet_association_108049401': 2413,
    'wordnet_authorization_101138670': 0,
    'wordnet_ban_107255401': 0,
    'wordnet_barrister_109840963': 774,
    'wordnet_baseball_league_108231874': 216,
    'wordnet_basketball_league_108232299': 162,
    'wordnet_bill_106536853': 14,
    'wordnet_bond_113417410': 0,
    'wordnet_broadcasting_company_108002015': 228,
    'wordnet_builder_109878275': 195,
    'wordnet_bull_106726761': 179,
    'wordnet_bus_company_108186761': 310,
    'wordnet_case_law_106535035': 0,
    'wordnet_certification_101139830': 6,
    'wordnet_chamber_of_commerce_108319061': 170,
    'wordnet_charge_106561942': 0,
    'wordnet_charity_108406619': 2384,
    'wordnet_check_113381734': 0,
    'wordnet_chess_club_108229275': 54,
    'wordnet_chief_justice_109916788': 524,
    'wordnet_civil_law_108453464': 0,
    'wordnet_civil_liberty_113995662': 2,
    'wordnet_civil_right_105182563': 40,
    'wordnet_closed_corporation_108383690': 0,
    'wordnet_club_108227214': 30080,
    'wordnet_code_of_conduct_105668095': 60,
    'wordnet_commercial_treaty_106773857': 47,
    'wordnet_common_law_108453722': 0,
    'wordnet_company_108058098': 66792,
    'wordnet_concession_106526619': 6,
    'wordnet_consortium_108236438': 149,
    'wordnet_contract_106520944': 1,
    'wordnet_contractor_109960688': 24,
    'wordnet_contractor_109960891': 0,
    'wordnet_cooperative_101100877': 651,
    'wordnet_corporation_108059412': 352,
    'wordnet_court_108329453': 876,
    'wordnet_court_order_106539502': 9,
    'wordnet_criminal_law_106539178': 0,
    'wordnet_criminal_record_106490173': 10,
    'wordnet_decree_106539770': 0,
    'wordnet_defense_attorney_110000158': 44,
    'wordnet_delegating_101140839': 0,
    'wordnet_derivative_instrument_106480506': 0,
    'wordnet_directive_107170080': 189,
    'wordnet_district_attorney_110019072': 476,
    'wordnet_doge_110023264': 200,
    'wordnet_dot-com_108002384': 0,
    'wordnet_draft_113377268': 297,
    'wordnet_driver\'s_license_106550206': 65,
    'wordnet_drug_cartel_108236963': 14,
    'wordnet_drug_company_108002578': 576,
    'wordnet_electronics_company_108003035': 1284,
    'wordnet_exemption_100213903': 0,
    'wordnet_family_108227916': 15,
    'wordnet_family_business_108383909': 4,
    'wordnet_federal_court_108332330': 7,
    'wordnet_film_company_108003173': 113,
    'wordnet_food_company_108003427': 1483,
    'wordnet_football_league_108232496': 1852,
    'wordnet_foundation_108406486': 0,
    'wordnet_franchise_106526811': 31,
    'wordnet_fraternity_108229467': 346,
    'wordnet_fundamental_law_106533648': 527,
    'wordnet_furniture_company_108003525': 39,
    'wordnet_gang_108244062': 250,
    'wordnet_gas_company_108186655': 157,
    'wordnet_glee_club_108229605': 13,
    'wordnet_golf_club_108229694': 606,
    'wordnet_government_bond_113338234': 32,
    'wordnet_hockey_league_108232603': 494,
    'wordnet_holding_company_108185369': 211,
    'wordnet_human_right_105176846': 1011,
    'wordnet_imperial_decree_106541167': 0,
    'wordnet_indebtedness_114490319': 0,
    'wordnet_institute_108407330': 3475,
    'wordnet_interdiction_107255299': 0,
    'wordnet_judge_110225219': 5698,
    'wordnet_judiciary_108166187': 67,
    'wordnet_justiciar_110228592': 17,
    'wordnet_kite_113382471': 60,
    'wordnet_law_106532330': 0,
    'wordnet_law_108441203': 799,
    'wordnet_lawyer_110249950': 21654,
    'wordnet_league_108231184': 971,
    'wordnet_legal_code_106667792': 186,
    'wordnet_legal_document_106479665': 151,
    'wordnet_legislation_106535222': 991,
    'wordnet_legislative_act_106564387': 116,
    'wordnet_legislature_108163273': 1102,
    'wordnet_liability_114530403': 0,
    'wordnet_liberty_113994456': 0,
    'wordnet_liberty_113996061': 0,
    'wordnet_license_106549661': 57,
    'wordnet_limited_company_108185211': 0,
    'wordnet_litigant_110266848': 0,
    'wordnet_livery_company_108186898': 73,
    'wordnet_magistrate_110280945': 27,
    'wordnet_mandate_106556481': 17,
    'wordnet_military_court_108334087': 10,
    'wordnet_mining_company_108003619': 271,
    'wordnet_money_order_113380820': 26,
    'wordnet_mover_108478482': 20,
    'wordnet_negotiable_instrument_106481156': 0,
    'wordnet_obligation_106773150': 0,
    'wordnet_oil_company_108069241': 586,
    'wordnet_option_113241600': 56,
    'wordnet_ordinary_110382380': 14,
    'wordnet_packaging_company_108069342': 99,
    'wordnet_parliament_108319198': 375,
    'wordnet_party_110402824': 0,
    'wordnet_pass_106691083': 175,
    'wordnet_peace_106773976': 589,
    'wordnet_permission_106689297': 0,
    'wordnet_pipeline_company_108069487': 57,
    'wordnet_pleading_106559365': 0,
    'wordnet_poor_law_106538785': 14,
    'wordnet_power_company_108186393': 640,
    'wordnet_praetor_110463028': 148,
    'wordnet_prerogative_105178715': 0,
    'wordnet_printing_concern_108069627': 85,
    'wordnet_privilege_105158296': 3,
    'wordnet_privilege_105179567': 0,
    'wordnet_probate_court_108335087': 2,
    'wordnet_proclamation_101266491': 0,
    'wordnet_professional_association_108242675': 1270,
    'wordnet_prohibition_106542047': 16,
    'wordnet_prosecutor_110484858': 1795,
    'wordnet_public_defender_110490557': 4,
    'wordnet_racket_club_108230110': 0,
    'wordnet_right_104850341': 0,
    'wordnet_right_105174653': 997,
    'wordnet_right_113341756': 0,
    'wordnet_rowing_club_108230219': 193,
    'wordnet_secret_society_108235343': 292,
    'wordnet_security_113416345': 0,
    'wordnet_senate_108161477': 0,
    'wordnet_serjeant-at-law_110581890': 12,
    'wordnet_service_108186047': 0,
    'wordnet_shipping_company_108003717': 486,
    'wordnet_sorority_108230477': 22,
    'wordnet_state\'s_attorney_110649962': 79,
    'wordnet_statute_of_limitations_106533484': 3,
    'wordnet_steel_company_108003839': 259,
    'wordnet_subsidiary_company_108003935': 1149,
    'wordnet_superior_court_108335751': 13,
    'wordnet_supreme_court_108336188': 207,
    'wordnet_telephone_company_108186221': 528,
    'wordnet_tennis_club_108230590': 0,
    'wordnet_think_tank_108478702': 427,
    'wordnet_transportation_company_108004089': 26,
    'wordnet_treaty_106773434': 1848,
    'wordnet_trial_attorney_110728361': 1,
    'wordnet_trial_judge_110728523': 10,
    'wordnet_trucking_company_108004210': 39,
    'wordnet_trust_108236621': 16,
    'wordnet_utility_108185758': 208,
    'wordnet_warrant_106547059': 15,
    'wordnet_water_company_108186546': 71,
    'wordnet_writ_106552984': 35,
    'wordnet_written_agreement_106771653': 0,
    'wordnet_yacht_club_108230785': 305,
}

In [22]:
# Count the classes whose own entity count is non-zero and report the figure.
classes_with_entities = sum(1 for entity_count in counts.values() if entity_count)
print('{} {}'.format(
    'Number of classes with at least one entity that has a wikipedia page',
    classes_with_entities))


Number of classes with at least one entity that has a wikipedia page 126

In [23]:
# Histogram of the per-class entity counts. Only classes with at least one
# entity and fewer than 1000 entities are plotted.
non_zero_counts = sorted(value for value in counts.values() if value and value < 1000)
# Explicit figure/axes so the plot can carry its own labels and title.
fig, ax = plt.subplots()
n, bins, patches = ax.hist(non_zero_counts, 50, facecolor='green', alpha=0.75)
ax.set_xscale("log")
ax.set_xlabel('Entities per class (log scale)')
ax.set_ylabel('Number of classes')
ax.set_title('Classes with 1 to 999 downloaded entities')
plt.show()



In [23]:
def get_total_count(graph, node, total_counts, entity_counts=None):
    """Returns the count of entities of the node and all subnodes.

    Args:
        graph: object whose ``successors(node)`` yields the child nodes of
            ``node`` (presumably the networkx graph built earlier in the
            notebook -- confirm).
        node: the node whose total is wanted.
        total_counts: dict used as a memo; the totals of ``node`` and of
            every node visited below it are stored in it.
        entity_counts: optional dict holding each node's own entity count.
            When omitted, the notebook-level ``counts`` dict is used, which
            is what the original three-argument call did.

    Returns:
        The node's own count plus the totals of all its successors.

    Raises:
        KeyError: if a visited node has no entry in ``entity_counts``.

    Note:
        A node reachable through several successors is added once per path,
        so the totals are not counts of distinct descendants when the
        hierarchy is not a tree. There is no cycle protection: a cyclic
        graph would recurse without end.
    """
    if entity_counts is None:
        # Backward-compatible default: fall back to the notebook global.
        entity_counts = counts
    if node in total_counts:
        return total_counts[node]
    # Successor totals are computed (and memoised) before the node's own
    # count is looked up, as in the original implementation.
    total = sum(
        get_total_count(graph, child, total_counts, entity_counts)
        for child in graph.successors(node))
    total += entity_counts[node]
    total_counts[node] = total
    return total

In [24]:
# Fill total_counts with the total of every node in the graph;
# get_total_count stores each total it computes in the dict, so every node is
# evaluated only once.
total_counts = {}
# nodes() rather than nodes_iter(): the latter exists only in networkx 1.x,
# while nodes() is iterable in both 1.x and 2.x.
for node in graph.nodes():
    get_total_count(graph, node, total_counts)

Let's see how many entities are downloaded in a category and all its children.


In [25]:
# One tab-separated "node<TAB>total" line per entry of total_counts.
row_template = '{}\t{}'
for node, count in total_counts.items():
    print(row_template.format(node, count))


wordnet_sorority_108230477	22
wordnet_writ_106552984	59
wordnet_professional_association_108242675	1270
wordnet_supreme_court_108336188	207
wordnet_security_113416345	32
wordnet_defense_attorney_110000158	44
wordnet_oil_company_108069241	586
wordnet_trial_judge_110728523	10
wordnet_allegation_107236077	16
wordnet_consortium_108236438	179
wordnet_football_league_108232496	1852
wordnet_legal_code_106667792	186
wordnet_criminal_law_106539178	0
wordnet_golf_club_108229694	606
wordnet_contract_106520944	38
wordnet_ordinary_110382380	14
wordnet_accusation_107234230	16
wordnet_film_company_108003173	113
wordnet_foundation_108406486	2384
wordnet_trucking_company_108004210	39
wordnet_doge_110023264	200
wordnet_fraternity_108229467	346
wordnet_shipping_company_108003717	525
wordnet_subsidiary_company_108003935	1149
wordnet_liberty_113994456	0
wordnet_federal_court_108332330	7
wordnet_legislation_106535222	991
wordnet_chamber_of_commerce_108319061	170
wordnet_litigant_110266848	0
wordnet_check_113381734	60
wordnet_pipeline_company_108069487	57
wordnet_right_105174653	2048
wordnet_limited_company_108185211	0
wordnet_certification_101139830	8
wordnet_club_108227214	31619
wordnet_poor_law_106538785	14
wordnet_parliament_108319198	375
wordnet_tennis_club_108230590	0
wordnet_furniture_company_108003525	39
wordnet_lawyer_110249950	24872
wordnet_civil_right_105182563	40
wordnet_delegating_101140839	0
wordnet_justiciar_110228592	17
wordnet_prerogative_105178715	0
wordnet_legislative_act_106564387	116
wordnet_association_108049401	43378
wordnet_concession_106526619	37
wordnet_trust_108236621	30
wordnet_human_right_105176846	1051
wordnet_draft_113377268	383
wordnet_derivative_instrument_106480506	56
wordnet_drug_company_108002578	576
wordnet_negotiable_instrument_106481156	383
wordnet_league_108231184	3695
wordnet_superior_court_108335751	13
wordnet_bus_company_108186761	310
wordnet_indebtedness_114490319	0
wordnet_chief_justice_109916788	524
wordnet_closed_corporation_108383690	4
wordnet_bill_106536853	14
wordnet_privilege_105158296	3
wordnet_franchise_106526811	31
wordnet_driver's_license_106550206	65
wordnet_interdiction_107255299	0
wordnet_prohibition_106542047	16
wordnet_gang_108244062	250
wordnet_bond_113417410	32
wordnet_yacht_club_108230785	305
wordnet_appellate_court_108330106	76
wordnet_racket_club_108230110	0
wordnet_gas_company_108186655	157
wordnet_holding_company_108185369	211
wordnet_broadcasting_company_108002015	228
wordnet_service_108186047	1914
wordnet_food_company_108003427	1483
wordnet_court_order_106539502	9
wordnet_treaty_106773434	2484
wordnet_think_tank_108478702	427
wordnet_court_108329453	1191
wordnet_statute_of_limitations_106533484	3
wordnet_trial_attorney_110728361	1
wordnet_liberty_113996061	0
wordnet_corporation_108059412	356
wordnet_right_104850341	0
wordnet_steel_company_108003839	259
wordnet_contractor_109960891	219
wordnet_magistrate_110280945	27
wordnet_electronics_company_108003035	1284
wordnet_advocate_109775663	33
wordnet_drug_cartel_108236963	14
wordnet_utility_108185758	1914
wordnet_code_of_conduct_105668095	60
wordnet_exemption_100213903	0
wordnet_serjeant-at-law_110581890	12
wordnet_alcalde_109781804	3
wordnet_senate_108161477	0
wordnet_telephone_company_108186221	528
wordnet_cooperative_101100877	651
wordnet_baseball_league_108231874	216
wordnet_government_bond_113338234	32
wordnet_family_108227916	15
wordnet_case_law_106535035	0
wordnet_money_order_113380820	26
wordnet_printing_concern_108069627	85
wordnet_right_113341756	0
wordnet_basketball_league_108232299	162
wordnet_directive_107170080	189
wordnet_mandate_106556481	17
wordnet_warrant_106547059	15
wordnet_secret_society_108235343	292
wordnet_written_agreement_106771653	2522
wordnet_power_company_108186393	640
wordnet_obligation_106773150	0
wordnet_imperial_decree_106541167	0
wordnet_family_business_108383909	4
wordnet_bull_106726761	179
wordnet_commercial_treaty_106773857	47
wordnet_pleading_106559365	16
wordnet_water_company_108186546	71
wordnet_charity_108406619	2384
wordnet_barrister_109840963	786
wordnet_privilege_105179567	0
wordnet_rowing_club_108230219	193
wordnet_institute_108407330	3475
wordnet_kite_113382471	60
wordnet_judiciary_108166187	67
wordnet_prosecutor_110484858	2350
wordnet_state's_attorney_110649962	79
wordnet_glee_club_108229605	13
wordnet_contractor_109960688	24
wordnet_hockey_league_108232603	494
wordnet_livery_company_108186898	73
wordnet_criminal_record_106490173	10
wordnet_ban_107255401	0
wordnet_common_law_108453722	0
wordnet_license_106549661	122
wordnet_civil_law_108453464	991
wordnet_peace_106773976	589
wordnet_civil_liberty_113995662	2
wordnet_builder_109878275	219
wordnet_chess_club_108229275	54
wordnet_party_110402824	219
wordnet_mover_108478482	20
wordnet_company_108058098	76527
wordnet_accreditation_101140193	2
wordnet_law_106532330	544
wordnet_probate_court_108335087	2
wordnet_law_108441203	1790
wordnet_dot-com_108002384	0
wordnet_mining_company_108003619	271
wordnet_fundamental_law_106533648	527
wordnet_permission_106689297	175
wordnet_judge_110225219	6641
wordnet_liability_114530403	0
wordnet_charge_106561942	16
wordnet_public_defender_110490557	4
wordnet_transportation_company_108004089	336
wordnet_decree_106539770	195
wordnet_military_court_108334087	10
wordnet_district_attorney_110019072	476
wordnet_legal_document_106479665	4960
wordnet_act_106532095	1060
wordnet_praetor_110463028	148
wordnet_authorization_101138670	8
wordnet_legislature_108163273	1477
wordnet_option_113241600	56
wordnet_pass_106691083	175
wordnet_proclamation_101266491	0
wordnet_packaging_company_108069342	99

Compare the new downloads with the number of entities in the previous downloads


In [26]:
# Entity counts from the previous downloads, keyed by YAGO class name. The
# next cell compares each value with total_counts, i.e. with the new total of
# the class plus its descendants.
# NOTE(review): the keys appear to be the same classes as CATEGORIES -- confirm.
previous_counts = {
    'wordnet_association_108049401': 43389,
    'wordnet_authorization_101138670': 8,
    'wordnet_ban_107255401': 0,
    'wordnet_case_law_106535035': 0,
    'wordnet_civil_liberty_113995662': 9,
    'wordnet_code_of_conduct_105668095': 67,
    'wordnet_common_law_108453722': 0,
    'wordnet_company_108058098': 77694,
    'wordnet_contract_106520944': 38,
    'wordnet_cooperative_101100877': 734,
    'wordnet_corporation_108059412': 391,
    'wordnet_court_108329453': 1200,
    'wordnet_criminal_record_106490173': 10,
    'wordnet_decree_106539770': 197,
    'wordnet_delegating_101140839': 0,
    'wordnet_directive_107170080': 231,
    'wordnet_exemption_100213903': 0,
    'wordnet_foundation_108406486': 2525,
    'wordnet_indebtedness_114490319': 0,
    'wordnet_interdiction_107255299': 0,
    'wordnet_judge_110225219': 11015,
    'wordnet_judiciary_108166187': 86,
    'wordnet_law_106532330': 598,
    'wordnet_law_108441203': 1891,
    'wordnet_lawyer_110249950': 24872,
    'wordnet_legal_code_106667792': 186,
    'wordnet_legal_document_106479665': 5731,
    'wordnet_legislation_106535222': 992,
    'wordnet_legislative_act_106564387': 189,
    'wordnet_legislature_108163273': 1491,
    'wordnet_liability_114530403': 0,
    'wordnet_liberty_113994456': 0,
    'wordnet_liberty_113996061': 0,
    'wordnet_limited_company_108185211': 0,
    'wordnet_mandate_106556481': 18,
    'wordnet_obligation_106773150': 0,
    'wordnet_party_110402824': 219,
    'wordnet_permission_106689297': 176,
    'wordnet_pleading_106559365': 16,
    'wordnet_prerogative_105178715': 0,
    'wordnet_privilege_105158296': 3,
    'wordnet_privilege_105179567': 0,
    'wordnet_proclamation_101266491': 0,
    'wordnet_prohibition_106542047': 16,
    'wordnet_right_104850341': 0,
    'wordnet_right_105174653': 2092,
    'wordnet_right_113341756': 0,
    'wordnet_treaty_106773434': 2532,
    'wordnet_written_agreement_106771653': 2569,
}

In [27]:
# Report every class whose previous count differs from the new total
# (class plus descendants). The printed difference is previous minus new.
for previous_count_node, previous_count in previous_counts.items():
    # Guard the dict that is actually indexed below: a class can only be
    # compared if a total was computed for it.
    assert previous_count_node in total_counts, (
        'No total count computed for {}'.format(previous_count_node))
    current_count = total_counts[previous_count_node]
    if previous_count != current_count:
        print('Difference in node {} of {}'.format(
            previous_count_node, previous_count - current_count))


Difference in node wordnet_treaty_106773434 of 48
Difference in node wordnet_court_108329453 of 9
Difference in node wordnet_corporation_108059412 of 35
Difference in node wordnet_code_of_conduct_105668095 of 7
Difference in node wordnet_foundation_108406486 of 141
Difference in node wordnet_directive_107170080 of 42
Difference in node wordnet_mandate_106556481 of 1
Difference in node wordnet_written_agreement_106771653 of 47
Difference in node wordnet_legislation_106535222 of 1
Difference in node wordnet_legislative_act_106564387 of 73
Difference in node wordnet_right_105174653 of 44
Difference in node wordnet_judiciary_108166187 of 19
Difference in node wordnet_cooperative_101100877 of 83
Difference in node wordnet_civil_liberty_113995662 of 7
Difference in node wordnet_association_108049401 of 11
Difference in node wordnet_company_108058098 of 1167
Difference in node wordnet_law_106532330 of 54
Difference in node wordnet_law_108441203 of 101
Difference in node wordnet_permission_106689297 of 1
Difference in node wordnet_judge_110225219 of 4374
Difference in node wordnet_decree_106539770 of 2
Difference in node wordnet_legal_document_106479665 of 771
Difference in node wordnet_legislature_108163273 of 14

Mapping to high level classes


In [46]:
# High-level class assigned to each mapped YAGO category. The values used are
# 'abstraction', 'act', 'document', 'organization' and 'person'.
# This dict is extended in place further down, where add_children_to_map
# gives unmapped successor classes the class of their parent.
HL_CLASSES_MAP = {
    'wordnet_code_of_conduct_105668095': 'abstraction',
    'wordnet_legal_code_106667792': 'document',
    'wordnet_law_106532330': 'document',
    'wordnet_law_108441203': 'document',
    'wordnet_legislation_106535222': 'document',
    'wordnet_authorization_101138670': 'act',
    'wordnet_proclamation_101266491': 'act',
    'wordnet_decree_106539770': 'document',
    'wikicat_Legal_doctrines_and_principles': 'abstraction',
    'wordnet_common_law_108453722': 'abstraction',
    'wordnet_case_law_106535035': 'abstraction',
    'wordnet_privilege_105179567': 'abstraction',
    'wordnet_prerogative_105178715': 'abstraction',
    'wordnet_privilege_105158296': 'abstraction',
    'wordnet_liberty_113994456': 'abstraction',
    'wordnet_liberty_113996061': 'abstraction',
    'wordnet_civil_liberty_113995662': 'abstraction',
    'wordnet_contract_106520944': 'document',
    'wordnet_legislative_act_106564387': 'act',
    'wordnet_exemption_100213903': 'act',
    'wordnet_treaty_106773434': 'document',
    'wordnet_legal_document_106479665': 'document',
    'wordnet_written_agreement_106771653': 'document',
    'wordnet_criminal_record_106490173': 'document',
    'wordnet_mandate_106556481': 'document',
    'wordnet_right_113341756': 'abstraction',
    'wordnet_obligation_106773150': 'document',
    'wordnet_permission_106689297': 'abstraction',
    'wordnet_liability_114530403': 'abstraction',
    'wordnet_indebtedness_114490319': 'abstraction',
    'wordnet_right_105174653': 'abstraction',
    'wordnet_right_104850341': 'abstraction',
    'wordnet_directive_107170080': 'act',
    'wordnet_interdiction_107255299': 'act',
    'wordnet_ban_107255401': 'act',
    # NOTE(review): the LKIF consistency check in this notebook reports the
    # LKIF class Prohibition as mapped to both 'document' (this entry) and
    # 'act' (ban, interdiction) -- confirm which one is intended.
    'wordnet_prohibition_106542047': 'document',
    'wordnet_limited_company_108185211': 'organization',
    'wordnet_association_108049401': 'organization',
    'wordnet_corporation_108059412': 'organization',
    'wordnet_court_108329453': 'organization',
    'wordnet_foundation_108406486': 'organization',
    'wordnet_cooperative_101100877': 'organization',
    'wordnet_legislature_108163273': 'organization',
    'wordnet_delegating_101140839': 'act',
    'wordnet_pleading_106559365': 'act',
    'wordnet_company_108058098': 'organization',
    'wordnet_party_110402824': 'person',
    'wordnet_judge_110225219': 'person',
    'wordnet_judiciary_108166187': 'person',
    'wordnet_lawyer_110249950': 'person',
}

In [30]:
# Distinct high-level class labels used in the mapping, sorted ascending.
HL_CLASSES = list(set(HL_CLASSES_MAP.values()))
HL_CLASSES.sort()
print(HL_CLASSES)


['abstraction', 'act', 'document', 'organization', 'person']

In [32]:
# Every category in CATEGORIES must have a high-level class. Collect all the
# missing ones first so a failure names them instead of raising a bare
# AssertionError on the first.
unmapped_categories = [
    category for category in CATEGORIES if category not in HL_CLASSES_MAP]
assert not unmapped_categories, (
    'Categories without a high level class: {}'.format(unmapped_categories))

Check consistency with LKIF classes


In [37]:
# For each LKIF category, gather the high-level classes of the YAGO
# categories it maps to. Any LKIF category whose set does not hold exactly
# one class is reported.
LKIF_HL_CLASSES = defaultdict(set)
for lkif_category, yago_categories in LKIF_TO_YAGO.items():
    for yago_category in yago_categories:
        LKIF_HL_CLASSES[lkif_category].add(HL_CLASSES_MAP[yago_category])
for lkif_category, hl_classes in LKIF_HL_CLASSES.items():
    if len(hl_classes) == 1:
        continue
    report_fields = (
        "Inconsistent mapping", lkif_category, LKIF_TO_YAGO[lkif_category], hl_classes)
    # Space-joined text of every field, like a multi-argument print statement.
    print(u' '.join(u'{}'.format(field) for field in report_fields))


Inconsistent mapping Prohibition [u'wordnet_prohibition_106542047', u'wordnet_ban_107255401', u'wordnet_interdiction_107255299'] set(['document', 'act'])

Build the mapping for all classes in the extended ontology


In [49]:
def add_children_to_map(mapping, category, graph):
    """Propagates the class of ``category`` to its descendants in ``graph``.

    Each successor of ``category`` that has no entry in ``mapping`` receives
    the class of ``category``. A successor that already has an entry keeps
    it; if that entry differs from the parent's class, a message is printed.
    The function then recurses into the successor, so its own descendants
    inherit from it.

    Args:
        mapping: dict from category name to high-level class; modified in
            place.
        category: category whose successors are processed; it must already
            be a key of ``mapping``.
        graph: object whose ``successors(node)`` yields the child categories
            of ``node`` (presumably the networkx graph built earlier in the
            notebook -- confirm).

    Raises:
        AssertionError: if ``category`` is not in ``mapping``.

    Note:
        There is no cycle protection: a cyclic graph would recurse without
        end.
    """
    assert category in mapping
    # The parent's entry cannot change during the loop: only missing keys
    # are ever written.
    parent_class = mapping[category]
    for child in graph.successors(category):
        # setdefault leaves an existing entry alone and fills in a missing one.
        child_class = mapping.setdefault(child, parent_class)
        if child_class != parent_class:
            # Single-argument print call: same output on Python 2 and valid
            # syntax on Python 3.
            print('Difference in mapping with parent {} ({}) and child {} ({})'.format(
                category, parent_class, child, child_class))
        add_children_to_map(mapping, child, graph)

In [50]:
# Extend the mapping to the descendants of every category mapped so far.
# Membership is tested against a set built once instead of calling
# graph.nodes() for every key.
graph_nodes = set(graph.nodes())
# Iterate over a snapshot of the keys: add_children_to_map inserts new keys
# into HL_CLASSES_MAP, which would break iteration over a live key view.
for category in list(HL_CLASSES_MAP):
    if category in graph_nodes:
        add_children_to_map(HL_CLASSES_MAP, category, graph)


Difference in mapping with parent wordnet_act_106532095 (document) and child wordnet_legislative_act_106564387 (act)

In [51]:
# Display the mapping as extended by add_children_to_map in the previous cell.
HL_CLASSES_MAP


Out[51]:
{'wikicat_Legal_doctrines_and_principles': 'abstraction',
 u'wordnet_accreditation_101140193': 'act',
 u'wordnet_accusation_107234230': 'act',
 u'wordnet_act_106532095': 'document',
 u'wordnet_advocate_109775663': 'person',
 u'wordnet_alcalde_109781804': 'person',
 u'wordnet_allegation_107236077': 'act',
 u'wordnet_appellate_court_108330106': 'organization',
 'wordnet_association_108049401': 'organization',
 'wordnet_authorization_101138670': 'act',
 'wordnet_ban_107255401': 'act',
 u'wordnet_barrister_109840963': 'person',
 u'wordnet_baseball_league_108231874': 'organization',
 u'wordnet_basketball_league_108232299': 'organization',
 u'wordnet_bill_106536853': 'document',
 u'wordnet_bond_113417410': 'document',
 u'wordnet_broadcasting_company_108002015': 'organization',
 u'wordnet_builder_109878275': 'person',
 u'wordnet_bull_106726761': 'document',
 u'wordnet_bus_company_108186761': 'organization',
 'wordnet_case_law_106535035': 'abstraction',
 u'wordnet_certification_101139830': 'act',
 u'wordnet_chamber_of_commerce_108319061': 'organization',
 u'wordnet_charge_106561942': 'act',
 u'wordnet_charity_108406619': 'organization',
 u'wordnet_check_113381734': 'document',
 u'wordnet_chess_club_108229275': 'organization',
 u'wordnet_chief_justice_109916788': 'person',
 u'wordnet_civil_law_108453464': 'document',
 'wordnet_civil_liberty_113995662': 'abstraction',
 u'wordnet_civil_right_105182563': 'abstraction',
 u'wordnet_closed_corporation_108383690': 'organization',
 u'wordnet_club_108227214': 'organization',
 'wordnet_code_of_conduct_105668095': 'abstraction',
 u'wordnet_commercial_treaty_106773857': 'document',
 'wordnet_common_law_108453722': 'abstraction',
 'wordnet_company_108058098': 'organization',
 u'wordnet_concession_106526619': 'document',
 u'wordnet_consortium_108236438': 'organization',
 'wordnet_contract_106520944': 'document',
 u'wordnet_contractor_109960688': 'person',
 u'wordnet_contractor_109960891': 'person',
 'wordnet_cooperative_101100877': 'organization',
 'wordnet_corporation_108059412': 'organization',
 'wordnet_court_108329453': 'organization',
 u'wordnet_court_order_106539502': 'document',
 u'wordnet_criminal_law_106539178': 'document',
 'wordnet_criminal_record_106490173': 'document',
 'wordnet_decree_106539770': 'document',
 u'wordnet_defense_attorney_110000158': 'person',
 'wordnet_delegating_101140839': 'act',
 u'wordnet_derivative_instrument_106480506': 'document',
 'wordnet_directive_107170080': 'act',
 u'wordnet_district_attorney_110019072': 'person',
 u'wordnet_doge_110023264': 'person',
 u'wordnet_dot-com_108002384': 'organization',
 u'wordnet_draft_113377268': 'document',
 u"wordnet_driver's_license_106550206": 'document',
 u'wordnet_drug_cartel_108236963': 'organization',
 u'wordnet_drug_company_108002578': 'organization',
 u'wordnet_electronics_company_108003035': 'organization',
 'wordnet_exemption_100213903': 'act',
 u'wordnet_family_108227916': 'organization',
 u'wordnet_family_business_108383909': 'organization',
 u'wordnet_federal_court_108332330': 'organization',
 u'wordnet_film_company_108003173': 'organization',
 u'wordnet_food_company_108003427': 'organization',
 u'wordnet_football_league_108232496': 'organization',
 'wordnet_foundation_108406486': 'organization',
 u'wordnet_franchise_106526811': 'document',
 u'wordnet_fraternity_108229467': 'organization',
 u'wordnet_fundamental_law_106533648': 'document',
 u'wordnet_furniture_company_108003525': 'organization',
 u'wordnet_gang_108244062': 'organization',
 u'wordnet_gas_company_108186655': 'organization',
 u'wordnet_glee_club_108229605': 'organization',
 u'wordnet_golf_club_108229694': 'organization',
 u'wordnet_government_bond_113338234': 'document',
 u'wordnet_hockey_league_108232603': 'organization',
 u'wordnet_holding_company_108185369': 'organization',
 u'wordnet_human_right_105176846': 'abstraction',
 u'wordnet_imperial_decree_106541167': 'document',
 'wordnet_indebtedness_114490319': 'abstraction',
 u'wordnet_institute_108407330': 'organization',
 'wordnet_interdiction_107255299': 'act',
 'wordnet_judge_110225219': 'person',
 'wordnet_judiciary_108166187': 'person',
 u'wordnet_justiciar_110228592': 'person',
 u'wordnet_kite_113382471': 'document',
 'wordnet_law_106532330': 'document',
 'wordnet_law_108441203': 'document',
 'wordnet_lawyer_110249950': 'person',
 u'wordnet_league_108231184': 'organization',
 'wordnet_legal_code_106667792': 'document',
 'wordnet_legal_document_106479665': 'document',
 'wordnet_legislation_106535222': 'document',
 'wordnet_legislative_act_106564387': 'act',
 'wordnet_legislature_108163273': 'organization',
 'wordnet_liability_114530403': 'abstraction',
 'wordnet_liberty_113994456': 'abstraction',
 'wordnet_liberty_113996061': 'abstraction',
 u'wordnet_license_106549661': 'document',
 'wordnet_limited_company_108185211': 'organization',
 u'wordnet_litigant_110266848': 'person',
 u'wordnet_livery_company_108186898': 'organization',
 u'wordnet_magistrate_110280945': 'person',
 'wordnet_mandate_106556481': 'document',
 u'wordnet_military_court_108334087': 'organization',
 u'wordnet_mining_company_108003619': 'organization',
 u'wordnet_money_order_113380820': 'document',
 u'wordnet_mover_108478482': 'organization',
 u'wordnet_negotiable_instrument_106481156': 'document',
 'wordnet_obligation_106773150': 'document',
 u'wordnet_oil_company_108069241': 'organization',
 u'wordnet_option_113241600': 'document',
 u'wordnet_ordinary_110382380': 'person',
 u'wordnet_packaging_company_108069342': 'organization',
 u'wordnet_parliament_108319198': 'organization',
 'wordnet_party_110402824': 'person',
 u'wordnet_pass_106691083': 'abstraction',
 u'wordnet_peace_106773976': 'document',
 'wordnet_permission_106689297': 'abstraction',
 u'wordnet_pipeline_company_108069487': 'organization',
 'wordnet_pleading_106559365': 'act',
 u'wordnet_poor_law_106538785': 'document',
 u'wordnet_power_company_108186393': 'organization',
 u'wordnet_praetor_110463028': 'person',
 'wordnet_prerogative_105178715': 'abstraction',
 u'wordnet_printing_concern_108069627': 'organization',
 'wordnet_privilege_105158296': 'abstraction',
 'wordnet_privilege_105179567': 'abstraction',
 u'wordnet_probate_court_108335087': 'organization',
 'wordnet_proclamation_101266491': 'act',
 u'wordnet_professional_association_108242675': 'organization',
 'wordnet_prohibition_106542047': 'document',
 u'wordnet_prosecutor_110484858': 'person',
 u'wordnet_public_defender_110490557': 'person',
 u'wordnet_racket_club_108230110': 'organization',
 'wordnet_right_104850341': 'abstraction',
 'wordnet_right_105174653': 'abstraction',
 'wordnet_right_113341756': 'abstraction',
 u'wordnet_rowing_club_108230219': 'organization',
 u'wordnet_secret_society_108235343': 'organization',
 u'wordnet_security_113416345': 'document',
 u'wordnet_senate_108161477': 'organization',
 u'wordnet_serjeant-at-law_110581890': 'person',
 u'wordnet_service_108186047': 'organization',
 u'wordnet_shipping_company_108003717': 'organization',
 u'wordnet_sorority_108230477': 'organization',
 u"wordnet_state's_attorney_110649962": 'person',
 u'wordnet_statute_of_limitations_106533484': 'document',
 u'wordnet_steel_company_108003839': 'organization',
 u'wordnet_subsidiary_company_108003935': 'organization',
 u'wordnet_superior_court_108335751': 'organization',
 u'wordnet_supreme_court_108336188': 'organization',
 u'wordnet_telephone_company_108186221': 'organization',
 u'wordnet_tennis_club_108230590': 'organization',
 u'wordnet_think_tank_108478702': 'organization',
 u'wordnet_transportation_company_108004089': 'organization',
 'wordnet_treaty_106773434': 'document',
 u'wordnet_trial_attorney_110728361': 'person',
 u'wordnet_trial_judge_110728523': 'person',
 u'wordnet_trucking_company_108004210': 'organization',
 u'wordnet_trust_108236621': 'organization',
 u'wordnet_utility_108185758': 'organization',
 u'wordnet_warrant_106547059': 'document',
 u'wordnet_water_company_108186546': 'organization',
 u'wordnet_writ_106552984': 'document',
 'wordnet_written_agreement_106771653': 'document',
 u'wordnet_yacht_club_108230785': 'organization'}

In [52]:
# Save the extended mapping to disk.
# NOTE(review): utils.pickle_to_file is a project helper not shown in this
# notebook; presumably it pickles the object to the given path -- confirm.
utils.pickle_to_file(HL_CLASSES_MAP, '../../data/hl_classes_mapping.pickle')

In [ ]: